Learning new techniques Stat 545

https://stat545.com/basic-data-care.html

Laoding all the Packages needed

library(gapminder)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Exploring the structures and datasets

Different ways to explore a data frame, tibble

str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
class(gapminder)
## [1] "tbl_df"     "tbl"        "data.frame"
gapminder
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 1,694 more rows
head(gapminder)
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
tail(gapminder)
## # A tibble: 6 x 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1982    60.4  7636524      789.
## 2 Zimbabwe Africa     1987    62.4  9216418      706.
## 3 Zimbabwe Africa     1992    60.4 10704340      693.
## 4 Zimbabwe Africa     1997    46.8 11404948      792.
## 5 Zimbabwe Africa     2002    40.0 11926563      672.
## 6 Zimbabwe Africa     2007    43.5 12311143      470.
as_tibble(iris)
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows
names(gapminder)
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
summary(gapminder)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 
# Simple plot jus to visualize dataset 

plot(lifeExp ~ year, gapminder)

plot(lifeExp ~ gdpPercap, gapminder)

plot(lifeExp ~ log(gdpPercap), gapminder)

head(gapminder$lifeExp)
## [1] 28.801 30.332 31.997 34.020 36.088 38.438
summary(gapminder$lifeExp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.60   48.20   60.71   59.47   70.85   82.60
hist(gapminder$lifeExp)

class(gapminder$continent)
## [1] "factor"
summary(gapminder$continent)
##   Africa Americas     Asia   Europe  Oceania 
##      624      300      396      360       24
levels(gapminder$continent)
## [1] "Africa"   "Americas" "Asia"     "Europe"   "Oceania"
nlevels(gapminder$continent)
## [1] 5

Slowly diving deeper into the learning

table(gapminder$continent)
## 
##   Africa Americas     Asia   Europe  Oceania 
##      624      300      396      360       24
barplot(table(gapminder$continent))

Using Dplyr

It is part of the tidyverse package,

Cmd+Shift+M (Mac).- shortcut for %>%

filter(gapminder, lifeExp < 29)
## # A tibble: 2 x 6
##   country     continent  year lifeExp     pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>   <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8 8425333      779.
## 2 Rwanda      Africa     1992    23.6 7290203      737.
filter(gapminder, country == "Rwanda" , year >1979)
## # A tibble: 6 x 6
##   country continent  year lifeExp     pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
## 1 Rwanda  Africa     1982    46.2 5507565      882.
## 2 Rwanda  Africa     1987    44.0 6349365      848.
## 3 Rwanda  Africa     1992    23.6 7290203      737.
## 4 Rwanda  Africa     1997    36.1 7212583      590.
## 5 Rwanda  Africa     2002    43.4 7852401      786.
## 6 Rwanda  Africa     2007    46.2 8860588      863.
filter(gapminder, country %in% c("Rwanda", "Afghanistan"))
## # A tibble: 24 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 14 more rows
gapminder %>% 
  select(year, lifeExp) %>% 
  head(4)
## # A tibble: 4 x 2
##    year lifeExp
##   <int>   <dbl>
## 1  1952    28.8
## 2  1957    30.3
## 3  1962    32.0
## 4  1967    34.0
gapminder %>% 
  filter(country == "Cambodia") %>% 
  select(year, lifeExp)
## # A tibble: 12 x 2
##     year lifeExp
##    <int>   <dbl>
##  1  1952    39.4
##  2  1957    41.4
##  3  1962    43.4
##  4  1967    45.4
##  5  1972    40.3
##  6  1977    31.2
##  7  1982    51.0
##  8  1987    53.9
##  9  1992    55.8
## 10  1997    56.5
## 11  2002    56.8
## 12  2007    59.7
### Creating a copy of gapminder so no changes are made to the original dataset

(my_gap <- gapminder)
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 1,694 more rows
my_gap %>% 
  mutate(gdp = pop * gdpPercap)
## # A tibble: 1,704 x 7
##    country     continent  year lifeExp      pop gdpPercap          gdp
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>        <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.  6567086330.
##  2 Afghanistan Asia       1957    30.3  9240934      821.  7585448670.
##  3 Afghanistan Asia       1962    32.0 10267083      853.  8758855797.
##  4 Afghanistan Asia       1967    34.0 11537966      836.  9648014150.
##  5 Afghanistan Asia       1972    36.1 13079460      740.  9678553274.
##  6 Afghanistan Asia       1977    38.4 14880372      786. 11697659231.
##  7 Afghanistan Asia       1982    39.9 12881816      978. 12598563401.
##  8 Afghanistan Asia       1987    40.8 13867957      852. 11820990309.
##  9 Afghanistan Asia       1992    41.7 16317921      649. 10595901589.
## 10 Afghanistan Asia       1997    41.8 22227415      635. 14121995875.
## # … with 1,694 more rows

Replicating data in a row to fit number of levels

Maybe it would be more meaningful to consumers of my tables and figures to stick with GDP per capita. But what if I reported GDP per capita, relative to some benchmark country. Since Canada is my adopted home, I’ll go with that.

I need to create a new variable that is gdpPercap divided by Canadian gdpPercap, taking care that I always divide two numbers that pertain to the same year.

How I achieve this:

1.Filter down to the rows for Canada. 2.Create a new temporary variable in my_gap: 3.Extract the gdpPercap variable from the Canadian data. 4.Replicate it once per country in the dataset, so it has the right length. 5.Divide raw gdpPercap by this Canadian figure. 6.Discard the temporary variable of replicated Canadian gdpPercap

summarize_at() applies the same summary function(s) to multiple variables

my_gap <- gapminder
ctib <- my_gap %>% 
  filter(country == "Canada") 

my_gap <- my_gap %>% 
  mutate (tmp = rep(ctib$gdpPercap, nlevels(country)),
         gdpPercapRel = gdpPercap / tmp,
         tmp = NULL)
head(my_gap)
## # A tibble: 6 x 7
##   country     continent  year lifeExp      pop gdpPercap gdpPercapRel
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>        <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.       0.0686
## 2 Afghanistan Asia       1957    30.3  9240934      821.       0.0657
## 3 Afghanistan Asia       1962    32.0 10267083      853.       0.0634
## 4 Afghanistan Asia       1967    34.0 11537966      836.       0.0520
## 5 Afghanistan Asia       1972    36.1 13079460      740.       0.0390
## 6 Afghanistan Asia       1977    38.4 14880372      786.       0.0356
summary (my_gap$gdpPercapRel)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.007236 0.061648 0.171521 0.326659 0.446564 9.534690
### Select 

my_gap %>% 
  filter(country == "Burundi", year > 1996) %>% 
  select( yr = year, lifeExp, gdpPercap) %>% 
  select(gdpPercap, everything())
## # A tibble: 3 x 3
##   gdpPercap    yr lifeExp
##       <dbl> <int>   <dbl>
## 1      463.  1997    45.3
## 2      446.  2002    47.4
## 3      430.  2007    49.6
#How many observations do we have per continent?
my_gap %>% 
  group_by(continent) %>% 
  summarize(n = n())
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
# The tally() function is a convenience function that knows to count rows. It honors groups

my_gap %>% 
  group_by(continent) %>% 
  tally()
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
# The count() function is an even more convenient function that does both grouping and counting

my_gap %>% 
  count(continent)
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
#What if we wanted to add the number of unique countries for each continent?

my_gap %>% 
  group_by(continent) %>% 
  summarize ( n = n(),
              n_countries = n_distinct(country)) 
## # A tibble: 5 x 3
##   continent     n n_countries
##   <fct>     <int>       <int>
## 1 Africa      624          52
## 2 Americas    300          25
## 3 Asia        396          33
## 4 Europe      360          30
## 5 Oceania      24           2
my_gap %>% 
  group_by(continent) %>% 
  summarize(avg_lifeExp = mean(lifeExp))
## # A tibble: 5 x 2
##   continent avg_lifeExp
##   <fct>           <dbl>
## 1 Africa           48.9
## 2 Americas         64.7
## 3 Asia             60.1
## 4 Europe           71.9
## 5 Oceania          74.3
# Let’s compute average and median life expectancy and GDP per capita by continent by year…but only for 1952 and 2007.
my_gap %>% 
  filter(year %in% c(1952,2007)) %>% 
  group_by(continent, year) %>% 
  summarize_at(vars(lifeExp,gdpPercap), list(~mean(.),~median(.)))
## # A tibble: 10 x 6
## # Groups:   continent [5]
##    continent  year lifeExp_mean gdpPercap_mean lifeExp_median
##    <fct>     <int>        <dbl>          <dbl>          <dbl>
##  1 Africa     1952         39.1          1253.           38.8
##  2 Africa     2007         54.8          3089.           52.9
##  3 Americas   1952         53.3          4079.           54.7
##  4 Americas   2007         73.6         11003.           72.9
##  5 Asia       1952         46.3          5195.           44.9
##  6 Asia       2007         70.7         12473.           72.4
##  7 Europe     1952         64.4          5661.           65.9
##  8 Europe     2007         77.6         25054.           78.6
##  9 Oceania    1952         69.3         10298.           69.3
## 10 Oceania    2007         80.7         29810.           80.7
## # … with 1 more variable: gdpPercap_median <dbl>
#Let’s focus just on Asia. What are the minimum and maximum life expectancies seen by year?

my_gap %>% 
  filter(continent == "Asia") %>% 
  group_by(year) %>% 
  summarize ( min_lifeExp = min(lifeExp), max_lifeExp = max(lifeExp))
## # A tibble: 12 x 3
##     year min_lifeExp max_lifeExp
##    <int>       <dbl>       <dbl>
##  1  1952        28.8        65.4
##  2  1957        30.3        67.8
##  3  1962        32.0        69.4
##  4  1967        34.0        71.4
##  5  1972        36.1        73.4
##  6  1977        31.2        75.4
##  7  1982        39.9        77.1
##  8  1987        40.8        78.7
##  9  1992        41.7        79.4
## 10  1997        41.8        80.7
## 11  2002        42.1        82  
## 12  2007        43.8        82.6

Computing with group-wise summaries

Let’s make a new variable that is the years of life expectancy gained (lost) relative to 1952, for each individual country. We group by country and use mutate() to make a new variable. The first() function extracts the first value from a vector. Notice that first() is operating on the vector of life expectancies within each country group.

my_gap %>% 
  group_by(country) %>% 
  select(country, year, lifeExp) %>% 
  mutate(lifeExp_gain = lifeExp - first(lifeExp)) %>% 
  filter(year < 1963)
## # A tibble: 426 x 4
## # Groups:   country [142]
##    country      year lifeExp lifeExp_gain
##    <fct>       <int>   <dbl>        <dbl>
##  1 Afghanistan  1952    28.8         0   
##  2 Afghanistan  1957    30.3         1.53
##  3 Afghanistan  1962    32.0         3.20
##  4 Albania      1952    55.2         0   
##  5 Albania      1957    59.3         4.05
##  6 Albania      1962    64.8         9.59
##  7 Algeria      1952    43.1         0   
##  8 Algeria      1957    45.7         2.61
##  9 Algeria      1962    48.3         5.23
## 10 Angola       1952    30.0         0   
## # … with 416 more rows
my_gap %>%
  filter(continent == "Asia") %>% 
  select(year, country, lifeExp) %>% 
  group_by(year) %>% 
  filter(min_rank(desc(lifeExp)) < 2 | min_rank(lifeExp) < 2 ) %>% 
  arrange(year) %>% 
  print(n = Inf)
## # A tibble: 24 x 3
## # Groups:   year [12]
##     year country     lifeExp
##    <int> <fct>         <dbl>
##  1  1952 Afghanistan    28.8
##  2  1952 Israel         65.4
##  3  1957 Afghanistan    30.3
##  4  1957 Israel         67.8
##  5  1962 Afghanistan    32.0
##  6  1962 Israel         69.4
##  7  1967 Afghanistan    34.0
##  8  1967 Japan          71.4
##  9  1972 Afghanistan    36.1
## 10  1972 Japan          73.4
## 11  1977 Cambodia       31.2
## 12  1977 Japan          75.4
## 13  1982 Afghanistan    39.9
## 14  1982 Japan          77.1
## 15  1987 Afghanistan    40.8
## 16  1987 Japan          78.7
## 17  1992 Afghanistan    41.7
## 18  1992 Japan          79.4
## 19  1997 Afghanistan    41.8
## 20  1997 Japan          80.7
## 21  2002 Afghanistan    42.1
## 22  2002 Japan          82  
## 23  2007 Afghanistan    43.8
## 24  2007 Japan          82.6
##  which country experienced the sharpest 5-year drop in life expectancy

my_gap %>% 
  select(country, year, continent, lifeExp) %>%
  group_by(continent, country) %>%
  mutate( le_delta = lifeExp - lag(lifeExp)) %>%
  summarize( worst_le_delta = min(le_delta, na.rm = TRUE)) %>% 
  top_n(-1 , wt = worst_le_delta) %>% 
  arrange(worst_le_delta)
## # A tibble: 5 x 3
## # Groups:   continent [5]
##   continent country     worst_le_delta
##   <fct>     <fct>                <dbl>
## 1 Africa    Rwanda             -20.4  
## 2 Asia      Cambodia            -9.10 
## 3 Americas  El Salvador         -1.51 
## 4 Europe    Montenegro          -1.46 
## 5 Oceania   Australia            0.170

Chapter 8 - TidyR

Chapter 9 - Writing and reading data

library(tidyverse)
library(fs)
(gap_tsv <- path_package("gapminder","extdata","gapminder.tsv"))
## /Users/alodia/Library/R/3.6/library/gapminder/extdata/gapminder.tsv
## Rectangular data 

gapminder <- read_tsv(gap_tsv)
## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character(),
##   year = col_double(),
##   lifeExp = col_double(),
##   pop = col_double(),
##   gdpPercap = col_double()
## )
str(gapminder, give.attr = FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of  6 variables:
##  $ country  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ continent: chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ year     : num  1952 1957 1962 1967 1972 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
gapminder <- gapminder %>%  
  mutate(country = factor(country),
         continent = factor (continent))

str(gapminder)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : num  1952 1957 1962 1967 1972 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
# Let’s create a country-level summary of maximum life expectancy

gap_life_exp <- gapminder %>% 
  group_by(country, continent) %>% 
  summarise(life_exp = max(lifeExp)) %>% 
  ungroup()

gap_life_exp
## # A tibble: 142 x 3
##    country     continent life_exp
##    <fct>       <fct>        <dbl>
##  1 Afghanistan Asia          43.8
##  2 Albania     Europe        76.4
##  3 Algeria     Africa        72.3
##  4 Angola      Africa        42.7
##  5 Argentina   Americas      75.3
##  6 Australia   Oceania       81.2
##  7 Austria     Europe        79.8
##  8 Bahrain     Asia          75.6
##  9 Bangladesh  Asia          64.1
## 10 Belgium     Europe        79.4
## # … with 132 more rows
write_csv(gap_life_exp, "gap_life_exp.csv")


head(levels(gap_life_exp$country))
## [1] "Afghanistan" "Albania"     "Algeria"     "Angola"      "Argentina"  
## [6] "Australia"
gap_life_exp <- gap_life_exp %>% 
  mutate(country = fct_reorder(country, life_exp)) 

head(levels(gap_life_exp$country))
## [1] "Sierra Leone" "Angola"       "Afghanistan"  "Liberia"     
## [5] "Rwanda"       "Mozambique"
gap_life_exp$country
##   [1] Afghanistan              Albania                 
##   [3] Algeria                  Angola                  
##   [5] Argentina                Australia               
##   [7] Austria                  Bahrain                 
##   [9] Bangladesh               Belgium                 
##  [11] Benin                    Bolivia                 
##  [13] Bosnia and Herzegovina   Botswana                
##  [15] Brazil                   Bulgaria                
##  [17] Burkina Faso             Burundi                 
##  [19] Cambodia                 Cameroon                
##  [21] Canada                   Central African Republic
##  [23] Chad                     Chile                   
##  [25] China                    Colombia                
##  [27] Comoros                  Congo, Dem. Rep.        
##  [29] Congo, Rep.              Costa Rica              
##  [31] Cote d'Ivoire            Croatia                 
##  [33] Cuba                     Czech Republic          
##  [35] Denmark                  Djibouti                
##  [37] Dominican Republic       Ecuador                 
##  [39] Egypt                    El Salvador             
##  [41] Equatorial Guinea        Eritrea                 
##  [43] Ethiopia                 Finland                 
##  [45] France                   Gabon                   
##  [47] Gambia                   Germany                 
##  [49] Ghana                    Greece                  
##  [51] Guatemala                Guinea                  
##  [53] Guinea-Bissau            Haiti                   
##  [55] Honduras                 Hong Kong, China        
##  [57] Hungary                  Iceland                 
##  [59] India                    Indonesia               
##  [61] Iran                     Iraq                    
##  [63] Ireland                  Israel                  
##  [65] Italy                    Jamaica                 
##  [67] Japan                    Jordan                  
##  [69] Kenya                    Korea, Dem. Rep.        
##  [71] Korea, Rep.              Kuwait                  
##  [73] Lebanon                  Lesotho                 
##  [75] Liberia                  Libya                   
##  [77] Madagascar               Malawi                  
##  [79] Malaysia                 Mali                    
##  [81] Mauritania               Mauritius               
##  [83] Mexico                   Mongolia                
##  [85] Montenegro               Morocco                 
##  [87] Mozambique               Myanmar                 
##  [89] Namibia                  Nepal                   
##  [91] Netherlands              New Zealand             
##  [93] Nicaragua                Niger                   
##  [95] Nigeria                  Norway                  
##  [97] Oman                     Pakistan                
##  [99] Panama                   Paraguay                
## [101] Peru                     Philippines             
## [103] Poland                   Portugal                
## [105] Puerto Rico              Reunion                 
## [107] Romania                  Rwanda                  
## [109] Sao Tome and Principe    Saudi Arabia            
## [111] Senegal                  Serbia                  
## [113] Sierra Leone             Singapore               
## [115] Slovak Republic          Slovenia                
## [117] Somalia                  South Africa            
## [119] Spain                    Sri Lanka               
## [121] Sudan                    Swaziland               
## [123] Sweden                   Switzerland             
## [125] Syria                    Taiwan                  
## [127] Tanzania                 Thailand                
## [129] Togo                     Trinidad and Tobago     
## [131] Tunisia                  Turkey                  
## [133] Uganda                   United Kingdom          
## [135] United States            Uruguay                 
## [137] Venezuela                Vietnam                 
## [139] West Bank and Gaza       Yemen, Rep.             
## [141] Zambia                   Zimbabwe                
## 142 Levels: Sierra Leone Angola Afghanistan Liberia Rwanda ... Japan
### Cleaning up the files that we have created 
file.remove(list.files(pattern = "^gap_life_exp"))
## [1] TRUE

Chapter 10 - Factors & forcasts package

library(tidyverse)
library(gapminder)
str(gapminder$continent)
##  Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
nlevels(gapminder$continent)
## [1] 5
class(gapminder$continent)
## [1] "factor"
gapminder %>% 
  count(continent)
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
fct_count(gapminder$continent)
## # A tibble: 5 x 2
##   f            n
##   <fct>    <int>
## 1 Africa     624
## 2 Americas   300
## 3 Asia       396
## 4 Europe     360
## 5 Oceania     24
### Make sure to drop the factors

nlevels(gapminder$country)
## [1] 142
h_countries <- c("Egypt", "Haiti", "Romania", "Thailand", "Venezuela")

h_gap <- gapminder %>% 
  filter(country %in% h_countries)

nlevels(h_gap$country)
## [1] 142
## where we actually drop the levels of the factor 

h_gap_dropped <- h_gap %>% 
  droplevels()

nlevels(h_gap_dropped$country )
## [1] 5
### use forcats::fct_drop() on a free- range factor 

h_gap$country %>% 
  fct_drop() %>% 
  levels()
## [1] "Egypt"     "Haiti"     "Romania"   "Thailand"  "Venezuela"
### Filter the gapminder data down to rows where population is less than a quarter of a million.Get rid of the unused factor levels for country and continent


fct_count(gapminder$country)
## # A tibble: 142 x 2
##    f               n
##    <fct>       <int>
##  1 Afghanistan    12
##  2 Albania        12
##  3 Algeria        12
##  4 Angola         12
##  5 Argentina      12
##  6 Australia      12
##  7 Austria        12
##  8 Bahrain        12
##  9 Bangladesh     12
## 10 Belgium        12
## # … with 132 more rows
fct_count(gapminder$continent)
## # A tibble: 5 x 2
##   f            n
##   <fct>    <int>
## 1 Africa     624
## 2 Americas   300
## 3 Asia       396
## 4 Europe     360
## 5 Oceania     24
names(gapminder)
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
test_gap <- gapminder %>% 
  filter( pop < 250000)

fct_count(test_gap$continent)
## # A tibble: 5 x 2
##   f            n
##   <fct>    <int>
## 1 Africa      26
## 2 Americas     0
## 3 Asia         7
## 4 Europe       8
## 5 Oceania      0
## should be 3 continents 

test_gap$country %>% 
fct_drop() %>% 
levels()
## [1] "Bahrain"               "Comoros"               "Djibouti"             
## [4] "Equatorial Guinea"     "Iceland"               "Kuwait"               
## [7] "Sao Tome and Principe"
test_gap$continent %>% 
fct_drop() %>% 
levels()
## [1] "Africa" "Asia"   "Europe"
### now trying to save it after getting it done 


test_gap_dropped <- test_gap %>% 
  droplevels()

nlevels(test_gap_dropped$continent) 
## [1] 3
nlevels(test_gap_dropped$country)
## [1] 7
### orderinf factor levels by frequency or by another variable 

#default

gapminder$continent %>% 
  levels()
## [1] "Africa"   "Americas" "Asia"     "Europe"   "Oceania"
## ordering by frequency 

gapminder$continent %>% 
  fct_infreq()  %>% 
  levels()
## [1] "Africa"   "Asia"     "Europe"   "Americas" "Oceania"
## backwards- least 
gapminder$continent %>% 
  fct_infreq() %>% 
  fct_rev %>% 
  levels()
## [1] "Oceania"  "Americas" "Europe"   "Asia"     "Africa"
## order countries by median life expectancy

fct_reorder(gapminder$country, gapminder$lifeExp) %>%
  levels() %>% 
  head()
## [1] "Sierra Leone"  "Guinea-Bissau" "Afghanistan"   "Angola"       
## [5] "Somalia"       "Guinea"
## order accoring to minimum life exp instead of median
fct_reorder(gapminder$country, gapminder$lifeExp, min) %>% 
  levels() %>%
  head()
## [1] "Rwanda"       "Afghanistan"  "Gambia"       "Angola"      
## [5] "Sierra Leone" "Cambodia"
## backwards (highest life exp)
fct_reorder(gapminder$country, gapminder$lifeExp, .desc = TRUE) %>% 
  levels() %>%
  head()
## [1] "Iceland"     "Japan"       "Sweden"      "Switzerland" "Netherlands"
## [6] "Norway"
###### Example of why we reorder factor levels: often makes plots much better! When a factor is mapped to x or y, it should almost always be reordered by the quantitative variable you are mapping to the other one

 gap_asia_2007 <- gapminder %>% 
  filter(year == 2007 , continent == "Asia")

 ggplot(gap_asia_2007,aes(x = lifeExp, y = country)) + geom_point()

ggplot(gap_asia_2007, aes(x = lifeExp, y = fct_reorder(country, lifeExp))) + geom_point()

#Use fct_reorder2() when you have a line chart of a quantitative x against another quantitative y and your factor provides the color. This way the legend appears in some order as the data!

h_countries <- c("Egypt", "Haiti", "Romania", "Thailand", "Venezuela")

h_gap <- gapminder %>% 
  filter(country %in% h_countries) %>% 
  droplevels()

ggplot(h_gap, aes(x= year, y = lifeExp, color = country)) + geom_line()

ggplot(h_gap, aes(x= year, y = lifeExp, color = fct_reorder2(country, year, lifeExp))) + geom_line() + labs(color = "country")

### Stopped at 10.7 too tired to continue ....hmmm..hmmm

## New day Jan 22-2020 - Growing a factor 
df1 <- gapminder %>% 
  filter(country %in% c("United States", "Mexico"), year > 2000) %>% 
  droplevels()

df2 <- gapminder %>% 
  filter(country %in% c("France","Germany"), year > 2000) %>% 
  droplevels()

levels(df1$country)
## [1] "Mexico"        "United States"
levels (df2$country)
## [1] "France"  "Germany"
## combining using fct_c

fct_c(df1$country, df2$country)
## [1] Mexico        Mexico        United States United States France       
## [6] France        Germany       Germany      
## Levels: Mexico United States France Germany

## Chapter 11 - Character vector

Stringr package - manipulating character vectors

Especially useful for functions that split one character vector into many and vice versa: separate(), unite(), extract(). Base functions: nchar(), strsplit(), substr(), paste(), paste0(). The glue package is fantastic for string interpolation. If stringr::str_interp() doesn’t get your job done, check out the glue package.

## Detect or filter on a target string - str_

str_detect(fruit, pattern = "fruit")
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [78] FALSE  TRUE FALSE
fruit
##  [1] "apple"             "apricot"           "avocado"          
##  [4] "banana"            "bell pepper"       "bilberry"         
##  [7] "blackberry"        "blackcurrant"      "blood orange"     
## [10] "blueberry"         "boysenberry"       "breadfruit"       
## [13] "canary melon"      "cantaloupe"        "cherimoya"        
## [16] "cherry"            "chili pepper"      "clementine"       
## [19] "cloudberry"        "coconut"           "cranberry"        
## [22] "cucumber"          "currant"           "damson"           
## [25] "date"              "dragonfruit"       "durian"           
## [28] "eggplant"          "elderberry"        "feijoa"           
## [31] "fig"               "goji berry"        "gooseberry"       
## [34] "grape"             "grapefruit"        "guava"            
## [37] "honeydew"          "huckleberry"       "jackfruit"        
## [40] "jambul"            "jujube"            "kiwi fruit"       
## [43] "kumquat"           "lemon"             "lime"             
## [46] "loquat"            "lychee"            "mandarine"        
## [49] "mango"             "mulberry"          "nectarine"        
## [52] "nut"               "olive"             "orange"           
## [55] "pamelo"            "papaya"            "passionfruit"     
## [58] "peach"             "pear"              "persimmon"        
## [61] "physalis"          "pineapple"         "plum"             
## [64] "pomegranate"       "pomelo"            "purple mangosteen"
## [67] "quince"            "raisin"            "rambutan"         
## [70] "raspberry"         "redcurrant"        "rock melon"       
## [73] "salal berry"       "satsuma"           "star fruit"       
## [76] "strawberry"        "tamarillo"         "tangerine"        
## [79] "ugli fruit"        "watermelon"
## Only keep the actual fruits that match  

my_fruit <- str_subset(fruit, pattern = "fruit")
my_fruit
## [1] "breadfruit"   "dragonfruit"  "grapefruit"   "jackfruit"   
## [5] "kiwi fruit"   "passionfruit" "star fruit"   "ugli fruit"
## substring extraction (and replacement by position)

length (my_fruit)
## [1] 8
str_length(my_fruit)
## [1] 10 11 10  9 10 12 10 10
## first 3 letters
head(fruit) %>% 
  str_sub(1,3) 
## [1] "app" "apr" "avo" "ban" "bel" "bil"
### The regex a.b will match all countries that have an a, followed by any single character, followed by b. Yes, regexes are case sensitive, i.e. “Italy” does not match.
countries <- levels(gapminder$country)
str_subset(countries, pattern = "i.a")
##  [1] "Argentina"                "Bosnia and Herzegovina"  
##  [3] "Burkina Faso"             "Central African Republic"
##  [5] "China"                    "Costa Rica"              
##  [7] "Dominican Republic"       "Hong Kong, China"        
##  [9] "Jamaica"                  "Mauritania"              
## [11] "Nicaragua"                "South Africa"            
## [13] "Swaziland"                "Taiwan"                  
## [15] "Thailand"                 "Trinidad and Tobago"
## Anchors can be included to express where the expression must occur within the string. The ^ indicates the beginning of string and $ indicates the end.

str_subset(countries, pattern = "i.a$")
## [1] "Argentina"              "Bosnia and Herzegovina"
## [3] "China"                  "Costa Rica"            
## [5] "Hong Kong, China"       "Jamaica"               
## [7] "South Africa"
str_subset(my_fruit, pattern = "d")
## [1] "breadfruit"  "dragonfruit"
str_subset(my_fruit, pattern = "^d")
## [1] "dragonfruit"

## 13 - Dates and Times

library(tidyverse)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
Sys.Date()
## [1] "2020-01-23"
today()
## [1] "2020-01-23"
str(Sys.Date())
##  Date[1:1], format: "2020-01-23"
class(Sys.Date())
## [1] "Date"
str(today())
##  Date[1:1], format: "2020-01-23"
class(today())
## [1] "Date"
Sys.time()
## [1] "2020-01-23 13:34:43 CST"
now()
## [1] "2020-01-23 13:34:43 CST"
str(Sys.time())
##  POSIXct[1:1], format: "2020-01-23 13:34:43"
class(Sys.time())
## [1] "POSIXct" "POSIXt"
str(now())
##  POSIXct[1:1], format: "2020-01-23 13:34:43"
class(now())
## [1] "POSIXct" "POSIXt"
### Column binding 

library(gapminder)

life_exp <- gapminder %>% 
  select(country, year,lifeExp)

pop  <- gapminder %>% 
  arrange(year) %>%
  select(pop)

gdp_percap <- gapminder %>% 
  arrange(pop) %>% 
  select(gdpPercap)

(gapminder_garbage <- bind_cols(life_exp, pop,gdp_percap))
## # A tibble: 1,704 x 5
##    country      year lifeExp      pop gdpPercap
##    <fct>       <dbl>   <dbl>    <dbl>     <dbl>
##  1 Afghanistan  1952    28.8  8425333      880.
##  2 Afghanistan  1957    30.3  1282697      861.
##  3 Afghanistan  1962    32.0  9279525     2670.
##  4 Afghanistan  1967    34.0  4232095     1072.
##  5 Afghanistan  1972    36.1 17876956     1385.
##  6 Afghanistan  1977    38.4  8691212     2865.
##  7 Afghanistan  1982    39.9  6927772     1533.
##  8 Afghanistan  1987    40.8   120447     1738.
##  9 Afghanistan  1992    41.7 46886859     3021.
## 10 Afghanistan  1997    41.8  8730405     1890.
## # … with 1,694 more rows
summary(gapminder$lifeExp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.60   48.20   60.71   59.47   70.85   82.60
summary(gapminder_garbage$lifeExp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.60   48.20   60.71   59.47   70.85   82.60
range(gapminder$gdpPercap)
## [1]    241.1659 113523.1329
range(gapminder_garbage$gdpPercap)
## [1]    241.1659 113523.1329

#### 18 - Writing your own functions, part 1

Build that skateboard before you build the car or some fancy car part. A limited-but-functioning thing is very useful. It also keeps the spirits high

The special argument … is very useful when you want the ability to pass arbitrary arguments down to another function, but without constantly expanding the formal arguments to your function. This leaves you with a less cluttered function definition and gives you future flexibility to specify these arguments only when you need to.

Until now, we’ve relied on informal tests of our evolving function. If you are going to use a function a lot, especially if it is part of a package, it is wise to use formal unit tests.

The testthat package (CRAN; GitHub) provides excellent facilities for this, with a distinct emphasis on automated unit testing of entire packages. However, we can take it out for a test drive even with our one measly function.

We will construct a test with test_that()

library(gapminder)
str(gapminder)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : num  1952 1957 1962 1967 1972 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
min(gapminder$lifeExp)
## [1] 23.599
max(gapminder$lifeExp)
## [1] 82.603
range(gapminder$lifeExp)
## [1] 23.599 82.603
### first function


max_minus_min <- function(x) max(x) - min(x)
max_minus_min(gapminder$lifeExp)
## [1] 59.004
max_minus_min(1:10)
## [1] 9
max_minus_min(runif(1000))
## [1] 0.9917401
### stop if not 

mmm <- function(x) {
  stopifnot(is.numeric(x))
  max(x) - min(x)
} 
mmm(gapminder)
## Error in mmm(gapminder): is.numeric(x) is not TRUE
mmm2 <- function(x){
  if(!is.numeric(x)) {
    stop('I am so sorry, but this function only works for numeric input!\n',
         'You have provided an object of class: ', class(x)[1])
  } 
  max(x)- min(x)
}
mmm2(gapminder)
## Error in mmm2(gapminder): I am so sorry, but this function only works for numeric input!
## You have provided an object of class: spec_tbl_df
quantile(gapminder$lifeExp)
##      0%     25%     50%     75%    100% 
## 23.5990 48.1980 60.7125 70.8455 82.6030
quantile(gapminder$lifeExp, probs = .50)
##     50% 
## 60.7125
median(gapminder$lifeExp)
## [1] 60.7125
quantile(gapminder$lifeExp, probs = c(.25,.75))
##     25%     75% 
## 48.1980 70.8455
boxplot(gapminder$lifeExp, plot = FALSE)$stats
##         [,1]
## [1,] 23.5990
## [2,] 48.1850
## [3,] 60.7125
## [4,] 70.8460
## [5,] 82.6030
### Now write a code snippet that takes the difference between two quantiles.

the_probs <- c(.25,.75)
the_quantiles <- quantile(gapminder$lifeExp, probs = the_probs )
max(the_quantiles) - min(the_quantiles)
## [1] 22.6475
qdiff1 <- function(x,probs) {
  stopifnot(is.numeric(x))
  the_quantiles <- quantile( x = x , probs = probs)
  max(the_quantiles) - min(the_quantiles)
}


qdiff1(gapminder$lifeExp, probs = c(.25,.75))
## [1] 22.6475
IQR(gapminder$lifeExp)
## [1] 22.6475
qdiff3 <- function(x, probs = c(0, 1)) {
  stopifnot(is.numeric(x))
  the_quantiles <- quantile(x, probs)
  max(the_quantiles) - min(the_quantiles)
}
qdiff3(gapminder$lifeExp)
## [1] 59.004
z <- gapminder$lifeExp
z[3] <- NA
quantile(gapminder$lifeExp)
##      0%     25%     50%     75%    100% 
## 23.5990 48.1980 60.7125 70.8455 82.6030
quantile(z)
## Error in quantile.default(z): missing values and NaN's not allowed if 'na.rm' is FALSE
quantile(z, na.rm = TRUE)
##     0%    25%    50%    75%   100% 
## 23.599 48.228 60.765 70.846 82.603
### Dealing with NA'S within our quantile function 

qdiff4 <- function(x, probs = c(0, 1)) {
  stopifnot(is.numeric(x))
  the_quantiles <- quantile(x, probs, na.rm = TRUE)
  max(the_quantiles) - min(the_quantiles)
}
qdiff4(gapminder$lifeExp)
## [1] 59.004
qdiff4(z)
## [1] 59.004
### Dont give defaults, on this function we set na.rm = na.rm 

qdiff5 <- function( x, probs = c(0,1), na.rm = TRUE) {
  stopifnot(is.numeric(x))
  the_quantiles <- quantile(x, probs, na.rm = na.rm)
  max(the_quantiles) - min(the_quantiles)
}
  
qdiff5(gapminder$lifeExp)
## [1] 59.004
qdiff5(z)
## [1] 59.004
 qdiff5(z, na.rm = FALSE) 
## Error in quantile.default(x, probs, na.rm = na.rm): missing values and NaN's not allowed if 'na.rm' is FALSE
 #### we can use ellipsis in function to give the end user freedom to add to the function
 
 qdiff6 <- function( x, probs = c(0,1), na.rm = TRUE, ...) {
   stopifnot(is.numeric(x)) 
   the_quantiles <- quantile(x, probs, na.rm = na.rm )
   max(the_quantiles) - min(the_quantiles)
 }


 
### showing difference between type 1 and type 2  
set.seed(1234)
z <- rnorm(10)
quantile(z, type = 1)
##         0%        25%        50%        75%       100% 
## -2.3456977 -0.8900378 -0.5644520  0.4291247  1.0844412
quantile(z, type = 3)
##         0%        25%        50%        75%       100% 
## -2.3456977 -1.2070657 -0.5644520  0.4291247  1.0844412
all.equal(quantile(z, type = 1), quantile(z,type = 3))
## [1] "Mean relative difference: 0.356196"
 qdiff6(z, probs = c(0.25,.075), type = 1)
## [1] 0.7659078
  qdiff6(z, probs = c(0.25,.075), type = 3)
## [1] 0.7659078
#### Let’s revert to a version of our function that does no NA handling, then test for proper NA handling. We can watch it fail.
  
library(testthat)
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
## 
##     matches
## The following object is masked from 'package:purrr':
## 
##     is_null
## The following object is masked from 'package:tidyr':
## 
##     matches
qdiff_no_NA_no <- function(x, probs = c(0, 1)) {
  the_quantiles <- quantile(x = x, probs = probs)
  max(the_quantiles) - min(the_quantiles)
}

### doesnt work and test that detects it correctly 
test_that('Na handling works' , {
  expect_that(qdiff_no_NA_no(c(1:5,NA)), equals(4))
})
## Error: Test failed: 'Na handling works'
## * missing values and NaN's not allowed if 'na.rm' is FALSE
## 1: expect_that(qdiff_no_NA_no(c(1:5, NA)), equals(4)) at <text>:144
## 2: condition(object)
## 3: expect_equal(x, expected, ..., expected.label = label)
## 4: quasi_label(enquo(object), label, arg = "object")
## 5: eval_bare(get_expr(quo), get_env(quo))
## 6: qdiff_no_NA_no(c(1:5, NA))
## 7: quantile(x = x, probs = probs) at <text>:138
## 8: quantile.default(x = x, probs = probs)
## 9: stop("missing values and NaN's not allowed if 'na.rm' is FALSE")
##works 
qdiff_no_NA <- function(x, probs = c(0, 1), na.rm = TRUE) {
  the_quantiles <- quantile(x = x, probs = probs, na.rm = na.rm)
  max(the_quantiles) - min(the_quantiles)
}


test_that('Na handling works' , {
  expect_that(qdiff_no_NA(c(1:5,NA)), equals(4))
})

Chapter 21 - Function writing practicum

library(gapminder)
library(ggplot2)
library(dplyr)

## getting data to practice 
j_country <- "France"
(j_dat <- gapminder %>% 
  filter(country == j_country))
## # A tibble: 12 x 6
##    country continent  year lifeExp      pop gdpPercap
##    <fct>   <fct>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 France  Europe     1952    67.4 42459667     7030.
##  2 France  Europe     1957    68.9 44310863     8663.
##  3 France  Europe     1962    70.5 47124000    10560.
##  4 France  Europe     1967    71.6 49569000    13000.
##  5 France  Europe     1972    72.4 51732000    16107.
##  6 France  Europe     1977    73.8 53165019    18293.
##  7 France  Europe     1982    74.9 54433565    20294.
##  8 France  Europe     1987    76.3 55630100    22066.
##  9 France  Europe     1992    77.5 57374179    24704.
## 10 France  Europe     1997    78.6 58623428    25890.
## 11 France  Europe     2002    79.6 59925035    28926.
## 12 France  Europe     2007    80.7 61083916    30470.
## Always always always plot the data 

p <- ggplot(j_dat, aes(x = year, y = lifeExp))
p + geom_point() + geom_smooth(method = "lm", se = FALSE)

##Fit the regression  - sanity check - change the intercept 

j_fit <- lm(lifeExp ~ year , j_dat)
coef(j_fit)
##  (Intercept)         year 
## -397.7646019    0.2385014
## coef didnt make sense because it says at year 0 life expectancy was at -397.76 so we will move it to start that the beginning of the data 1952....I() function which “inhibits interpretation/conversion of objects”

j_fit <- lm(lifeExp ~ I(year - 1952), j_dat)
coef(j_fit)
##    (Intercept) I(year - 1952) 
##     67.7901282      0.2385014
### Coefficient of 67.79 makes much more sense 

### turn working code into a function 

le_lin_fit <- function ( dat, offset = 1952) {
  the_fit <- lm(lifeExp ~ I(year - offset), dat)
  coef(the_fit)
}

le_lin_fit(j_dat)
##      (Intercept) I(year - offset) 
##       67.7901282        0.2385014
## its messy and we need to fix the names on it 

le_lin_fit <- function ( dat, offset = 1952) {
  the_fit <- lm(lifeExp ~ I(year - offset), dat)
  setNames(coef(the_fit), c("intercept", "slope"))
}
le_lin_fit(j_dat)
##  intercept      slope 
## 67.7901282  0.2385014
### testing the function on another country 

j_country <- "Zimbabwe"

(j_dat <- gapminder %>% 
  filter( country == j_country))
## # A tibble: 12 x 6
##    country  continent  year lifeExp      pop gdpPercap
##    <fct>    <fct>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 Zimbabwe Africa     1952    48.5  3080907      407.
##  2 Zimbabwe Africa     1957    50.5  3646340      519.
##  3 Zimbabwe Africa     1962    52.4  4277736      527.
##  4 Zimbabwe Africa     1967    54.0  4995432      570.
##  5 Zimbabwe Africa     1972    55.6  5861135      799.
##  6 Zimbabwe Africa     1977    57.7  6642107      686.
##  7 Zimbabwe Africa     1982    60.4  7636524      789.
##  8 Zimbabwe Africa     1987    62.4  9216418      706.
##  9 Zimbabwe Africa     1992    60.4 10704340      693.
## 10 Zimbabwe Africa     1997    46.8 11404948      792.
## 11 Zimbabwe Africa     2002    40.0 11926563      672.
## 12 Zimbabwe Africa     2007    43.5 12311143      470.
### visualizing 

p <- ggplot(j_dat, aes(x = year, y = lifeExp)) + geom_point()
p + geom_smooth(method = "lm", se = FALSE)

## coefficients 

j_fit <- lm(lifeExp ~ year , j_dat)
coefficients(j_fit)
##  (Intercept)         year 
## 236.79819464  -0.09302098
### create a function 

le_lin_fit(j_dat)
##   intercept       slope 
## 55.22124359 -0.09302098
### It’s also a good idea to clean out the workspace, rerun the minimum amount of code, and retest your function. This will help you catch another common mistake: accidentally relying on objects that were lying around in the workspace during development but that are not actually defined in your function nor passed as formal arguments.

rm(list = ls())

le_lin_fit <- function(dat, offset = 1952) {
  the_fit <- lm(lifeExp ~ I(year - offset), dat)
  setNames(coef(the_fit), c("Intercept", "Slope"))
}

le_lin_fit(gapminder %>% filter(country == "Zimbabwe"))
##   Intercept       Slope 
## 55.22124359 -0.09302098
le_lin_fit(gapminder %>% filter(country == "Kenya"))
##  Intercept      Slope 
## 47.0020385  0.2065077

### GGplot tutorial - https://github.com/jennybc/ggplot2-tutorial/blob/master/gapminder-ggplot2-scatterplot.md

library(tibble)
library(ggplot2)
library(gapminder)
gapminder
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 1,694 more rows
### empty plot - skeletons 

ggplot(gapminder, aes(x = gdpPercap, y = lifeExp))

p <- ggplot(gapminder, aes(x = gdpPercap, y = lifeExp))

### add points 
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) + geom_point()

p + geom_point()

## Log transformation 
ggplot(gapminder, aes(x = log10(gdpPercap), y = lifeExp)) + geom_point()

###### but better way to use transformations #####
p + geom_point() + scale_x_log10()

p <- p + geom_point() + scale_x_log10()

## Map Continent as a variable to aesthetic 

p+ geom_point(aes(color = continent))

### Let's add a summary 

plot(gapminder, aes( x = gdpPercap, y = lifeExp , color = continent)) + geom_point() + scale_x_log10()

## NULL
### addressing overplotting: Set Alpha transparancy and size to a particular value 

p+ geom_point()

p + geom_point(alpha = (1/3), size = 3)

## Add a fitted curve or line 

p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

p + geom_point()+ geom_smooth( lwd = 3 , se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

p + geom_point()+ geom_smooth( lwd = 3 , se = FALSE, method = "lm")

### Let's bring back our continents 

p + aes(color = continent) + geom_point() + geom_smooth( lwd = 3 , se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#### Facetting is just another way to exploit a factor 


p + geom_point(alpha = (1/8) , size = 3) + facet_wrap (~ continent)

p + geom_point(alpha = (1/8) , size = 3) + facet_wrap (~ continent) + geom_smooth( lwd = 2, se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#### Let's plot lifeExp against Year 

ggplot(gapminder, aes( x= year, y = lifeExp , color = continent)) + geom_jitter( alpha = 1/3 , size = 3)

### Making mini plots, split out by continent 

ggplot(gapminder, aes(x = year, y = lifeExp, color = continent)) + facet_wrap(~continent, scales = "free_x") + geom_jitter(alpha = 1/3, size = 3) + scale_color_manual(values = continent_colors)

ggplot(subset(gapminder, continent != "Oceania"), 
       aes(x = year, y = lifeExp , group = country , color =country)) +
  geom_line(lwd = 1 , show_guide = FALSE ) + facet_wrap(~continent) +
  scale_color_manual(values = country_colors) + 
  theme_bw() + theme(strip.text = element_text (size = rel(1.1)))
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.

### Subsetting data 

ggplot(subset (gapminder, country == "Zimbabwe"), aes(x = year, y = lifeExp)) + geom_line() + geom_point()

##can also get the same result with dplyr::filter

suppressPackageStartupMessages(library(dplyr))
ggplot(gapminder %>% filter(country == "Zimbabwe"),
       aes(x = year, y = lifeExp)) + geom_line() + geom_point()

jCountries <- c("Canada", "Rwanda", "Cambodia", "Mexico")
ggplot(subset(gapminder, country %in% jCountries),
       aes(x = year, y = lifeExp, color = country)) + geom_line() + geom_point()

ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  scale_x_log10() + geom_bin2d()

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] testthat_2.2.1  lubridate_1.7.4 fs_1.3.1        forcats_0.4.0  
##  [5] stringr_1.4.0   dplyr_0.8.3     purrr_0.3.3     readr_1.3.1    
##  [9] tidyr_1.0.0     tibble_2.1.3    ggplot2_3.2.1   tidyverse_1.3.0
## [13] gapminder_0.3.0
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5 xfun_0.10        splines_3.6.1    haven_2.2.0     
##  [5] lattice_0.20-38  colorspace_1.4-1 vctrs_0.2.0      generics_0.0.2  
##  [9] htmltools_0.4.0  mgcv_1.8-28      yaml_2.2.0       utf8_1.1.4      
## [13] rlang_0.4.1      pillar_1.4.2     glue_1.3.1       withr_2.1.2     
## [17] DBI_1.0.0        dbplyr_1.4.2     modelr_0.1.5     readxl_1.3.1    
## [21] lifecycle_0.1.0  munsell_0.5.0    gtable_0.3.0     cellranger_1.1.0
## [25] rvest_0.3.5      evaluate_0.14    labeling_0.3     knitr_1.25      
## [29] fansi_0.4.0      broom_0.5.2      Rcpp_1.0.2       scales_1.0.0    
## [33] backports_1.1.5  jsonlite_1.6     hms_0.5.2        digest_0.6.22   
## [37] stringi_1.4.3    grid_3.6.1       cli_1.1.0        tools_3.6.1     
## [41] magrittr_1.5     lazyeval_0.2.2   crayon_1.3.4     pkgconfig_2.0.3 
## [45] zeallot_0.1.0    Matrix_1.2-17    ellipsis_0.3.0   xml2_1.2.2      
## [49] reprex_0.3.0     assertthat_0.2.1 rmarkdown_1.16   httr_1.4.1      
## [53] rstudioapi_0.10  R6_2.4.0         nlme_3.1-140     compiler_3.6.1